We estimate and compare seven human mobility models (CDE, CDP, CDO, ERad, Imp, IO, Sto) to the 2011 census commuting flow data for England (\(\Omega^{CE}\)), Wales (\(\Omega^{CW}\)), Scotland (\(\Omega^{S}\)) and Northern Ireland (\(\Omega^{NI}\)). We estimate posterior distributions using hamiltonian MCMC (as implemented by the Stan package http://mc-stan.org/). To assess model fit and provide a basis for model selection we use approximate leave-one-out cross validation as implemented in the loo package (doi:10.1007/s11222-016-9696-4).
n_commuters = UKcensus_flows_lads %>% group_by(from) %>% summarise(commuters=sum(freq))
n_movers = UKcensus_flows_lads %>% filter(from!=to) %>% group_by(from) %>% summarise(commuters=sum(freq))
summary <- n_commuters %>% inner_join(n_movers,by='from')
summary <- summary %>% mutate(country=substr(from,1,1))
summary <- summary %>% group_by(country) %>% summarise(users=sum(commuters.x),movers=sum(commuters.y),p_move=movers/users)
z = lads_map %>% mutate(n_commuters = unlist(UKcensus_flows_lads %>% group_by(from) %>% summarise(commuters=sum(freq) ) %>% ungroup %>% select(commuters)),
n_movers = unlist(UKcensus_flows_lads %>% filter(from!=to) %>% group_by(from) %>% summarise(commuters=sum(freq)) %>% ungroup %>% select(commuters)),
p_move=n_movers/n_commuters)
f1 = ggplot(st_sf(z[1:406,]),aes(fill=p_move)) + geom_sf(size=0.0) +
scale_fill_gradient2(name='Users (per capita)',midpoint=log10(mean(z$p_move)),low=muted('blue'),high=muted('red'),trans='log10') +
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank()) +
theme(legend.position = "bottom") +
theme(text = element_text(size=8)) +
geom_sf(data=ukoutline,fill=NA,size=0.01)
f1
summary
## # A tibble: 4 x 4
## country users movers p_move
## <chr> <int> <int> <dbl>
## 1 E 23139206 9953850 0.430
## 2 N 724873 199052 0.275
## 3 S 2242725 619372 0.276
## 4 W 1263873 383303 0.303
A basic assumption of the human mobility models considered in this study is that commuter flows can be modelled based only on the spatial configuration of population density. This assumption breaks down for locations such as the City of London which have a comparatively small resident population but have an high density of workplaces, retail, recreational and other sites of interest. The exceptional nature of the City of London – and the adjacent district of Westminster – is evident from plotting the total influx and efflux of commuters for the English census data. Although there is some variation the vast majority of locations have an influx of commuters that roughly balances the efflux. In contrast the City of London and Westminster have influxs that are \(\sim\) 150 and 10 times larger than their efflux (top left). Human mobility models based only on population density cannot represent such features of real commuting data leading to a systematic lack of fit which may bias parameter estimates. As such we treat these LADs as outliers and remove from the data sets used for model estimation and comparison.
source('PPDfuncs.R')
load('./ERadiation/ERad_ECensus_all.RData')
load('./CDO/CDO_ECensus_all.RData')
ppCDO_all=ERadmatflux(Ecensus_mobility_dat,fitERad_CE_all,median=TRUE)
post_check<-tibble(census_out=rowSums(Ecensus_mobility_dat$mv),census_in=colSums(Ecensus_mobility_dat$mv),CDO_in = colSums(ppCDO_all$mean),CDO_out = rowSums(ppCDO_all$mean),lad=lads_map$lad17nm[1:326])
post_check$lad[!is.element(1:326,c(294,326))] = NA
p1=ggplot(post_check,aes(x=census_in,y=census_out,label=lad)) +
geom_point(color = ifelse(is.na(post_check$lad), "black", "red")) +
xlab('Influx (Census)') + ylab('Eflux (Census)')+ geom_label_repel()
p2=ggplot(post_check,aes(x=CDO_in,y=CDO_out,label=lad)) +
geom_point(color = ifelse(is.na(post_check$lad), "black", "red")) +
xlab('Influx (CDO)') + ylab('Eflux (CDO)') + geom_label_repel()
p3=ggplot(post_check,aes(x=census_in,y=CDO_in,label=lad)) +
geom_point(color = ifelse(is.na(post_check$lad), "black", "red")) +
xlab('Influx (Census)') + ylab('Influx (CDO)') + geom_label_repel()
(p1+p2)/p3
## Warning: Removed 324 rows containing missing values (geom_label_repel).
## Warning: Removed 324 rows containing missing values (geom_label_repel).
## Warning: Removed 324 rows containing missing values (geom_label_repel).
As an illustration we compare estimated posterior distributions for the CDO gravity model estimated from the English census data both with (326 LADs) and without (324 LADs) the City of London and Westminster. The systematic lack of fit to these two LADs is responsible for a huge bias in all of the estimated gravity scaling parameters.
post_compare <- fitCDO_CE_all %>% gather_draws(tau,rho,alpha,delta) %>% mutate(data='Census (E)',country='E',data='England (326 LADS)')
post_compare <- post_compare %>% bind_rows(fitCDO_CE %>% gather_draws(tau,rho,alpha,delta) %>% mutate(data='Census (E)',country='E',data='England (324 LADS)'))
post_compare <- post_compare %>% ungroup() %>% mutate(.variable = factor(.variable,levels=c('tau','rho','alpha','delta')))
cred95 <- function(x){return(data.frame(y=median(x),ymin=quantile(x,c(0.025)),ymax=quantile(x,0.975)))}
ggplot(post_compare,aes(x=data,y=.value)) + coord_flip() + stat_summary(fun.data=cred95,size=0.5, position = position_dodge(width=0.5)) + facet_wrap(~.variable,scales='free_x',labeller = label_parsed) + xlab('') + ylab('') + theme(legend.position = "none")
The approximate leave-one-out cross validation (LOO) method uses pareto smoothed importance sampling (PSIS-LOO) to efficiently estimate the predictive accuracy of a model (expected log pointwise predictive density, \(\hat{elpd}\)) and as a basis for model comparison and selection. The estimated shape parameter \(\hat{k}\) can be used to judge the reliability of the estimate of \(\hat{elpd}\) for each data point (or in our case for each LAD corresponding to a row of \(\Omega_{ji}\)). The estimate of \(\hat{elpd}\) is considered reliable (quick convergence) for \(\hat{k} < 0.5\), performance may still be reliable for values of \(\hat{k}\) up to 0.7. Values of \(\hat{k} > 0.7\) suggest that the data points are highly influential to the estimated posterior and potentially introducing bias.
After removal of the City of London and Westminster LADs from the English census data and the Highland LAD from the Scottish workflow data (for consistency with limitations of BBC data set) a set of 2-4 LADs still have values of \(\hat{k} > 0.7\) for the three gravity-type mobility models (CDE,CDO,CDP).
Gravity models are notoriously sensitive to the contribution from high density locations, borne out by the clustering of LADS with high pareto shape parameters around London and other high population density areas. The shape parameter (\(\phi\)) of the negative binomial likelihood used for estimation controls the variance to mean scaling relationship with the variance for a mean flux \(y\): \(y + \frac{y^2}{\phi}\). Hence reducing the value of \(\phi\) reduces the influence of larger populations (with larger population flux) to the likelihood. Hence, we carried out a sensitivity analysis fixing the value of \(\phi\) and reducing the value until all LADS \(\hat{k} < 0.7\). This was achieved for a fixed shape parameter of \(\phi=0.1\).
As below, fixing \(\phi\) reduces the absolute values of the likelihood and predictive accuracy of all of the models but does not change the rank ordering of the mobility models (section below). Reducing \(\phi\) increases the uncertainty but also systematically changes the estimated model parameters (illustrated for the Extended Radiation and CDO gravity models). Hence, as the posterior predictive accuracy as measured by CPC is higher for the model estimating \(\phi\) we use these estimates to compare with the BBC data.
The difference between \(\hat{elpd}\) for alternative models fitted to the same data provides a measure of their relative predictive accuracy.
lookup=array(c('CDE','CDP','CDO','ERad','IO','Imp','Sto'))
row.names(lookup)<- c('model1','model2','model3','model4','model5','model6','model7')
temp = loo_compare(loo_CDE_CE,loo_CDP_CE,loo_CDO_CE,loo_ERad_CE,loo_IO_CE,loo_Imp_CE,loo_Stoufer_CE)
looC_df <- data.frame(model=as.vector(lookup[names(temp[,1])]),elpd=paste(signif(temp[,1],3), ' (',signif(temp[,2],2),')',sep=''))
temp = loo_compare(loo_CDE_CW,loo_CDP_CW,loo_CDO_CW,loo_ERad_CW,loo_IO_CW,loo_Imp_CW,loo_Stoufer_CW)
looC_df <- cbind(looC_df,data.frame(model=as.vector(lookup[names(temp[,1])]),elpd=paste(signif(temp[,1],3), ' (',signif(temp[,2],2),')',sep='')))
temp = loo_compare(loo_CDE_CS,loo_CDP_CS,loo_CDO_CS,loo_ERad_CS,loo_IO_CS,loo_Imp_CS,loo_Stoufer_CS)
looC_df <- cbind(looC_df,data.frame(model=as.vector(lookup[names(temp[,1])]),elpd=paste(signif(temp[,1],3), ' (',signif(temp[,2],2),')',sep='')))
temp = loo_compare(loo_CDE_CNI,loo_CDP_CNI,loo_CDO_CNI,loo_ERad_CNI,loo_IO_CNI,loo_Imp_CNI,loo_Stoufer_CNI)
looC_df <- cbind(looC_df,data.frame(model=as.vector(lookup[names(temp[,1])]),elpd=paste(signif(temp[,1],3), ' (',signif(temp[,2],2),')',sep='')))
print.xtable(looC_df,file = './textables/looCensus.tex')
lookup=array(c('CDE','CDP','CDO','ERad','IO','Imp','Sto'))
row.names(lookup)<- c('model1','model2','model3','model4','model5','model6','model7')
temp = loo_compare(loo_CDE_CEphi1,loo_CDP_CEphi1,loo_CDO_CEphi1,loo_ERad_CEphi1,loo_IO_CEphi1,loo_Imp_CEphi1,loo_Sto_CEphi1)
looC_df_fixphi <- data.frame(model=as.vector(lookup[names(temp[,1])]),elpd=paste(signif(temp[,1],3), ' (',signif(temp[,2],2),')',sep=''))
temp = loo_compare(loo_CDE_CWphi1,loo_CDP_CWphi1,loo_CDO_CWphi1,loo_ERad_CWphi1,loo_IO_CWphi1,loo_Imp_CWphi1,loo_Sto_CWphi1)
looC_df_fixphi <- cbind(looC_df_fixphi,data.frame(model=as.vector(lookup[names(temp[,1])]),elpd=paste(signif(temp[,1],3), ' (',signif(temp[,2],2),')',sep='')))
temp = loo_compare(loo_CDE_CSphi1,loo_CDP_CSphi1,loo_CDO_CSphi1,loo_ERad_CSphi1,loo_IO_CSphi1,loo_Imp_CSphi1,loo_Sto_CSphi1)
looC_df_fixphi <- cbind(looC_df_fixphi,data.frame(model=as.vector(lookup[names(temp[,1])]),elpd=paste(signif(temp[,1],3), ' (',signif(temp[,2],2),')',sep='')))
temp = loo_compare(loo_CDE_CNIphi1,loo_CDP_CNIphi1,loo_CDO_CNIphi1,loo_ERad_CNIphi1,loo_IO_CNIphi1,loo_Imp_CNIphi1,loo_Sto_CNIphi1)
looC_df_fixphi <- cbind(looC_df_fixphi,data.frame(model=as.vector(lookup[names(temp[,1])]),elpd=paste(signif(temp[,1],3), ' (',signif(temp[,2],2),')',sep='')))
print.xtable(looC_df_fixphi,file = './textables/looCensus_fixphi.tex')